import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_theme()
monthly_1990 = pd.read_csv('./data/monthly_1990.csv', index_col='date')
qtr_2000 = pd.read_csv('./data/quarterly_2000.csv', index_col='date')
hp_idx = pd.read_csv('./data/hpi_monthly.csv', index_col='date')
hp_idx_qtr = pd.read_csv('./data/hpi_quarterly.csv', index_col='date')
datasets = [monthly_1990, qtr_2000, hp_idx, hp_idx_qtr]
for d in datasets:
d.index = pd.to_datetime(d.index).normalize()
## rename quarterly index
hp_idx_qtr.rename(columns={'hp_idx': 'hp_idx_qtr'}, inplace=True)
x_mnth = monthly_1990.join(hp_idx, how='inner')
x_mnth
| spend | permits | permit_val | starts | completions | manufactured | new_for_sale | months_supply | emratio | pop_level | gdp | mortgage_rate | fed_fund_rate | disp_income | pm_save | consump_durable | new_sold | hp_idx | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | ||||||||||||||||||
| 1993-01-01 | 194150 | 989 | 5383.028 | 1091 | 975.0 | 248 | 265 | 5.4 | 61.4 | 193962 | 99.759963 | 7.9925 | 2.66 | 7237.6 | 425.6 | 538.1 | 596 | 76.784 |
| 1993-02-01 | 194689 | 953 | 5844.533 | 1063 | 1043.0 | 248 | 263 | 5.3 | 61.4 | 194108 | 99.679238 | 7.6825 | 3.25 | 7271.8 | 441.5 | 524.2 | 604 | 76.838 |
| 1993-03-01 | 190185 | 881 | 8227.729 | 950 | 983.0 | 241 | 268 | 5.4 | 61.5 | 194248 | 99.598357 | 7.4950 | 3.65 | 7249.2 | 439.0 | 521.2 | 602 | 76.868 |
| 1993-04-01 | 198296 | 922 | 8918.858 | 1110 | 1067.0 | 241 | 270 | 4.7 | 61.5 | 194398 | 99.525442 | 7.4720 | 3.31 | 7286.8 | 436.4 | 543.0 | 701 | 76.937 |
| 1993-05-01 | 198835 | 911 | 8320.702 | 1128 | 973.0 | 240 | 273 | 5.3 | 61.7 | 194549 | 99.466172 | 7.4650 | 3.01 | 7276.3 | 415.8 | 552.0 | 626 | 77.037 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2020-08-01 | 638430 | 1055 | 21995.756 | 1023 | 895.0 | 95 | 286 | 3.5 | 56.5 | 260558 | 96.957495 | 2.9350 | 0.10 | 15553.9 | 2598.4 | 1754.4 | 1036 | 222.539 |
| 2020-09-01 | 657613 | 1121 | 23179.135 | 1105 | 933.0 | 96 | 286 | 2.5 | 56.6 | 260742 | 97.144271 | 2.8900 | 0.09 | 15643.4 | 2506.2 | 1774.6 | 991 | 225.793 |
| 2020-10-01 | 677725 | 1141 | 23846.574 | 1162 | 898.0 | 100 | 284 | 3.5 | 57.4 | 260925 | 97.335974 | 2.8340 | 0.09 | 15568.4 | 2370.9 | 1794.0 | 1001 | 229.403 |
| 2020-11-01 | 697825 | 1155 | 19839.048 | 1182 | 913.0 | 101 | 290 | 4.0 | 57.4 | 261085 | 97.545301 | 2.7650 | 0.09 | 15366.5 | 2239.3 | 1763.7 | 851 | 232.673 |
| 2020-12-01 | 719089 | 1233 | 22196.074 | 1315 | 958.0 | 104 | 299 | 3.8 | 57.4 | 261230 | 97.920930 | 2.6840 | 0.09 | 15393.8 | 2426.2 | 1719.9 | 871 | 235.700 |
348 rows × 18 columns
x_qtr = qtr_2000.join(hp_idx_qtr, how='inner')
x_qtr
| owner_vacancy | rent_vacancy | week_earning | delinquent_rate | hor | hp_idx_qtr | |
|---|---|---|---|---|---|---|
| date | ||||||
| 2000-01-01 | 1.6 | 7.9 | 603 | 1.95 | 67.100000 | 101.339333 |
| 2000-04-01 | 1.5 | 8.0 | 606 | 1.97 | 67.200000 | 103.674333 |
| 2000-07-01 | 1.6 | 8.2 | 611 | 2.09 | 67.700000 | 105.786000 |
| 2000-10-01 | 1.6 | 7.8 | 614 | 2.23 | 67.500000 | 108.274667 |
| 2001-01-01 | 1.5 | 8.2 | 620 | 2.34 | 67.500000 | 110.484667 |
| ... | ... | ... | ... | ... | ... | ... |
| 2020-10-01 | 1.0 | 6.5 | 1029 | 2.73 | 65.822503 | 232.592000 |
| 2021-01-01 | 0.9 | 6.8 | 1040 | 2.68 | 65.600000 | 242.141667 |
| 2021-04-01 | 0.9 | 6.2 | 1048 | 2.48 | 65.400000 | 254.633000 |
| 2021-07-01 | 0.9 | 5.8 | 1068 | 2.31 | 65.400000 | 266.878333 |
| 2021-10-01 | 0.9 | 5.6 | 1069 | 2.28 | 65.500000 | 276.591000 |
88 rows × 6 columns
x_mnth.describe()
| spend | permits | permit_val | starts | completions | manufactured | new_for_sale | months_supply | emratio | pop_level | gdp | mortgage_rate | fed_fund_rate | disp_income | pm_save | consump_durable | new_sold | hp_idx | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 | 348.000000 |
| mean | 388427.977011 | 978.086207 | 13436.704716 | 1021.948276 | 1008.341954 | 163.853448 | 309.563218 | 5.547989 | 61.448563 | 228248.660920 | 99.924844 | 5.770086 | 2.611494 | 11060.189080 | 690.200575 | 1056.994540 | 721.683908 | 141.397727 |
| std | 131817.306373 | 350.756496 | 5004.263997 | 364.981422 | 350.616782 | 110.566859 | 97.260866 | 1.786734 | 2.226594 | 20452.252824 | 1.154711 | 1.608015 | 2.242938 | 2267.430612 | 571.627593 | 273.791779 | 263.823162 | 42.368958 |
| min | 190185.000000 | 337.000000 | 4187.117000 | 353.000000 | 368.000000 | 40.000000 | 142.000000 | 2.500000 | 51.300000 | 193962.000000 | 91.580035 | 2.684000 | 0.040000 | 7236.400000 | 193.400000 | 521.200000 | 270.000000 | 76.784000 |
| 25% | 262888.500000 | 678.750000 | 9384.458500 | 717.250000 | 713.500000 | 75.000000 | 256.250000 | 4.200000 | 59.550000 | 211956.500000 | 99.569831 | 4.195000 | 0.200000 | 9406.300000 | 379.075000 | 903.225000 | 504.750000 | 102.723250 |
| 50% | 365905.000000 | 1014.500000 | 12819.002500 | 1106.500000 | 1059.000000 | 113.000000 | 300.000000 | 5.100000 | 62.300000 | 228791.500000 | 99.947313 | 5.930250 | 2.040000 | 11196.000000 | 453.250000 | 1077.200000 | 703.000000 | 144.786500 |
| 75% | 507872.750000 | 1232.250000 | 16737.504500 | 1277.750000 | 1262.000000 | 249.500000 | 342.000000 | 6.300000 | 63.100000 | 246221.250000 | 100.512188 | 7.067500 | 5.055000 | 12495.975000 | 938.875000 | 1197.700000 | 893.000000 | 175.919500 |
| max | 719089.000000 | 1798.000000 | 28099.586000 | 1823.000000 | 1911.000000 | 390.000000 | 570.000000 | 12.200000 | 64.700000 | 261230.000000 | 101.829671 | 9.198000 | 7.800000 | 17170.700000 | 6392.500000 | 1794.000000 | 1389.000000 | 235.700000 |
x_qtr.describe()
| owner_vacancy | rent_vacancy | week_earning | delinquent_rate | hor | hp_idx_qtr | |
|---|---|---|---|---|---|---|
| count | 88.000000 | 88.000000 | 88.000000 | 88.000000 | 88.000000 | 88.000000 |
| mean | 1.905682 | 8.484091 | 800.806818 | 4.641250 | 66.426392 | 166.134367 |
| std | 0.520909 | 1.455381 | 124.448379 | 3.249938 | 1.836107 | 36.548143 |
| min | 0.900000 | 5.600000 | 603.000000 | 1.410000 | 62.900000 | 101.339333 |
| 25% | 1.600000 | 7.000000 | 696.750000 | 2.145000 | 64.800000 | 141.844500 |
| 50% | 1.800000 | 8.600000 | 789.500000 | 2.915000 | 66.900000 | 163.591667 |
| 75% | 2.225000 | 9.800000 | 881.500000 | 7.152500 | 68.000000 | 184.103583 |
| max | 2.900000 | 11.100000 | 1069.000000 | 11.500000 | 69.200000 | 276.591000 |
def detect_outlier(df_in:pd.DataFrame):
df_out = df_in.copy()
for c in df_out.columns:
q1 = df_out[c].quantile(0.25)
q3 = df_out[c].quantile(0.75)
iqr = q3 - q1
a, b = q1-1.5*iqr, q3+1.5*iqr
df_out['out_' + c] = (df_out[c] < a) | (df_out[c] > b)
return df_out
plt.close('all')
for col in x_mnth.columns:
f, ax = plt.subplots()
sns.histplot(x_mnth[[col]], ax=ax, kde=True)
plt.close('all')
for col in x_qtr.columns:
f, ax = plt.subplots()
sns.histplot(x_qtr[[col]], ax=ax, kde=True)
x_mnth_out = detect_outlier(x_mnth)
plt.close('all')
for col in x_mnth.columns:
sns.relplot(data=x_mnth_out.reset_index(), x='date', y=col, hue='out_' + col, kind='line', height=10, aspect=2)
x_qtr_out = detect_outlier(x_qtr)
plt.close('all')
for col in x_qtr.columns:
sns.relplot(data=x_qtr_out.reset_index(), x='date', y=col, hue='out_' + col, kind='line', height=10, aspect=2)
plt.close('all')
for col in x_mnth.columns[:-1]:
f, ax = plt.subplots(figsize=(16, 8))
x_mnth.plot(y=col, legend=False, ax=ax)
ax2 = ax.twinx()
x_mnth.plot(y="hp_idx", ax=ax2, legend=False, color="r")
ax.figure.legend()
plt.show()
plt.close('all')
for col in x_qtr.columns[:-1]:
f, ax = plt.subplots(figsize=(16, 8))
x_qtr.plot(y=col, legend=False, ax=ax)
ax2 = ax.twinx()
x_qtr.plot(y="hp_idx_qtr", ax=ax2, legend=False, color="r")
ax.figure.legend()
plt.show()
x_qtr_monthly = x_qtr.resample('MS').interpolate(method='linear')
x_qtr_monthly.head()
| owner_vacancy | rent_vacancy | week_earning | delinquent_rate | hor | hp_idx_qtr | |
|---|---|---|---|---|---|---|
| date | ||||||
| 2000-01-01 | 1.600000 | 7.900000 | 603.000000 | 1.950000 | 67.100000 | 101.339333 |
| 2000-02-01 | 1.566667 | 7.933333 | 604.000000 | 1.956667 | 67.133333 | 102.117667 |
| 2000-03-01 | 1.533333 | 7.966667 | 605.000000 | 1.963333 | 67.166667 | 102.896000 |
| 2000-04-01 | 1.500000 | 8.000000 | 606.000000 | 1.970000 | 67.200000 | 103.674333 |
| 2000-05-01 | 1.533333 | 8.066667 | 607.666667 | 2.010000 | 67.366667 | 104.378222 |
### join dataframes
x_monthly = x_mnth.join(x_qtr_monthly, how='inner')
### save for use
x_monthly.to_csv('./data/x_monthly.csv', index_label='date')
corr = x_monthly.corr()
mask = np.triu(corr)
sns.set(font_scale=1.2)
_, ax = plt.subplots(figsize=(25, 20))
p = sns.heatmap(corr,annot=True, ax=ax, mask=mask)